#!/usr/bin/perl

# ------------------------------------------------------------------------------
# --- Split clstr output from CD-HIT so we can parallelize 
# ---  duplicate_homologous_SQL.pl
# ------------------------------------------------------------------------------

use strict;
use warnings;

# Input CD-HIT .clstr file, number of clusters written to each split file,
# and the directory that receives the split files.
my $in_clstr       = "pdb_dssp_seq_95.fa.clstr";
my $clstr_per_file = 100;
my $out_dir        = 'split_cluster';

# Create the output directory on first use instead of failing on the
# first open with an unexplained error.
if (!-d $out_dir) {
	mkdir $out_dir
		or die "ERROR: Could not create output directory, $out_dir: $!\n";
}

open (my $clst_fh, '<', $in_clstr)
	or die "ERROR: Could not open input cluster file, $in_clstr: $!\n";

my $file_num = -1;   # Incremented before use, so the first split file is _0
my $out_fh;          # Handle of the split file currently being written
my $new_out;         # Path of that split file (used in error messages)

while (my $line = <$clst_fh>) {

	if ($line =~ /^>Cluster (\d+)/) {
		my $clust_num = $1;

		# Start a new file at every multiple of $clstr_per_file. Also start
		# one if nothing is open yet, so an input whose first cluster number
		# is not such a multiple still has somewhere to go.
		if (!defined $out_fh || $clust_num % $clstr_per_file == 0) {

			# Close the previous file explicitly: buffered write errors
			# (e.g. a full disk) are only reported at close time.
			if (defined $out_fh) {
				close ($out_fh)
					or die "ERROR: Couldn't close output cluster file, $new_out: $!\n";
			}

			$file_num++;
			$new_out = $out_dir . '/' . $in_clstr . '_' . $file_num;
			open ($out_fh, '>', $new_out)
				or die "ERROR: Couldn't open output cluster file, $new_out: $!\n";
		}
	}
	elsif (!defined $out_fh) {
		# Member line with no preceding cluster header: the input is not a
		# well-formed .clstr file, so stop rather than silently drop data.
		die "ERROR: Line $. of $in_clstr appears before any '>Cluster' header.\n";
	}

	print {$out_fh} $line
		or die "ERROR: Couldn't write to output cluster file, $new_out: $!\n";
}

# Flush and check the last split file (absent only when the input is empty).
if (defined $out_fh) {
	close ($out_fh)
		or die "ERROR: Couldn't close output cluster file, $new_out: $!\n";
}

close ($clst_fh)
	or die "ERROR: Could not close input cluster file, $in_clstr: $!\n";

exit;